'''
#==========================================================================================#
#
# Word Embeddings for the Analysis of Ideological Placement in Parliamentary Corpora
# Political Analysis
# Ludovic Rheault and Christopher Cochrane
#
#===========================================================================================#

This file reproduces the results printed in the main text of the paper.  We recommend using a Python virtual environment to avoid compatibility issues with the libraries needed, which are listed in the requirements.txt file.  Below is a summary of the key steps to set up a virtual environment and run the present file.  

We recommend using a version of Python 3.5 or above.  Note that for some operating systems, the python3 and pip3 commands may be needed below if Python 3 is not the default language associated with the python command.

Steps:

#1. After downloading and extracting the dataverse repository files, enter the replication folder.
unzip dataverse_files.zip
cd dataverse_files

#2a. Install the Python virtualenv package (if needed).
pip install virtualenv

#2b. Create and activate a virtual environment. 
virtualenv partyembed
source partyembed/bin/activate

#(On Windows: partyembed\Scripts\activate)

#3. Install requirements.
pip install -r requirements.txt

#4. Run the main results script (note, figures will be overwritten in the current directory).
python main_results.py

#5. To reproduce the results from the online appendix.
python appendix_results.py

#6. Deactivate the virtual environment.
deactivate

#7. Polarization figures (must be run after appendix_results.py).
Rscript polarization.R
'''

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tabulate import tabulate
from sklearn.decomposition import PCA
from gensim.models.doc2vec import Doc2Vec
import utils.labels as labels
import utils.plots as plots
from utils.interpret import Interpret
from utils.accuracy import pairwise_accuracy

#=========================================================================================#
# Figure 2: Party Placement in the US House (1873-2016)
#=========================================================================================#
# Load the pretrained US House model and the display labels of its party tags.
model = Doc2Vec.load('models/house200')
label_dict = labels.party_labels('USA')
fullnames, parties, cols, mkers = labels.party_tags(model, 'USA')
labs = [label_dict[tag] for tag in parties]
M = model.vector_size
P = len(parties)

# Stack the party vectors (one row per party tag) and reduce them to two
# principal components for plotting.
z = np.zeros((P, M))
for row, tag in enumerate(parties):
    z[row, :] = model.docvecs[tag]
pca = PCA(n_components=2)
Z = pd.DataFrame(pca.fit_transform(z), columns=['dim1', 'dim2'])
Z['label'] = labs

# Orient the axes for substantive interpretation: 'Dem 2015' must not lie
# above 'Rep 2015' on the first dimension, nor below it on the second.
# rev1/rev2 record whether each axis was flipped; they are reused for Table 1.
dem_2015 = Z[Z.label == 'Dem 2015']
rep_2015 = Z[Z.label == 'Rep 2015']
rev1 = bool(dem_2015.dim1.values[0] > rep_2015.dim1.values[0])
if rev1:
    Z['dim1'] = -Z['dim1']
rev2 = bool(dem_2015.dim2.values[0] < rep_2015.dim2.values[0])
if rev2:
    Z['dim2'] = -Z['dim2']

# Reproducing Figure 2
# Panel 2a: both dimensions at once.
plots.plot_2a(Z, labs, cols, mkers, savepath='figures/figure2a.pdf')
print("Saved Figure 2a to file figures/figure2a.pdf")
# Panels 2b and 2c: one time series per principal component.
for dimension, panel, path, legend in [(1, '2b', 'figures/figure2b.pdf', 'upper left'),
                                       (2, '2c', 'figures/figure2c.pdf', 'lower left')]:
    plots.plot_timeseries(Z, fullnames, cols, dimension=dimension, savepath=path, legend=legend)
    print("Saved Figure " + panel + " to file " + path)

#=========================================================================================#
# Table 1: Interpreting PCA Axes
#=========================================================================================#
# Build the interpreter on the House model and its fitted PCA; rev1/rev2 tell
# it which axes were flipped when Figure 2 was oriented.
# NOTE(review): top_words_list(20) presumably writes tables/table1.txt, as the
# message below reports -- confirm in utils/interpret.py.
table1 = Interpret(model, parties, pca, Z, labs,
                   rev1=rev1, rev2=rev2,
                   min_count=100, max_count=1000000, max_features=50000)
table1.top_words_list(20)
print("Saved Table 1 to file tables/table1.txt")

#=========================================================================================#
# Figure 3: Party Placement in Britain (1935-2014) and Canada (1901-2017)
#=========================================================================================#
# Loading other country models:
# UK: pretrained model and the display labels of its party tags.
ukmodel = Doc2Vec.load('models/uk200')
uk_dict = labels.party_labels('UK')
uknames, ukparties, ukcols, ukmkers = labels.party_tags(ukmodel, 'UK')
uklabs = [uk_dict[tag] for tag in ukparties]
Muk = ukmodel.vector_size
Puk = len(ukparties)

# One row per party tag, reduced to two principal components for plotting.
zuk = np.zeros((Puk, Muk))
for row, tag in enumerate(ukparties):
    zuk[row, :] = ukmodel.docvecs[tag]
pca_uk = PCA(n_components=2)
Zuk = pd.DataFrame(pca_uk.fit_transform(zuk), columns=['dim1', 'dim2'])
Zuk['label'] = uklabs

# Orient the first axis so that 'Labour 2010' does not lie above 'Cons 2010'.
labour_2010 = Zuk.loc[Zuk.label == 'Labour 2010', 'dim1'].values[0]
cons_2010 = Zuk.loc[Zuk.label == 'Cons 2010', 'dim1'].values[0]
if labour_2010 > cons_2010:
    Zuk['dim1'] = -Zuk['dim1']

# Canada: pretrained model and the display labels of its party tags.
canmodel = Doc2Vec.load('models/canada200')
can_dict = labels.party_labels('Canada')
cannames, canparties, cancols, canmkers = labels.party_tags(canmodel, 'Canada')
canlabs = [can_dict[tag] for tag in canparties]
Mcan = canmodel.vector_size
Pcan = len(canparties)

# One row per party tag, reduced to two principal components for plotting.
zcan = np.zeros((Pcan, Mcan))
for row, tag in enumerate(canparties):
    zcan[row, :] = canmodel.docvecs[tag]
pca_can = PCA(n_components=2)
Zcan = pd.DataFrame(pca_can.fit_transform(zcan), columns=['dim1', 'dim2'])
Zcan['label'] = canlabs

# Orient the first axis so that 'NDP 2015' does not lie above 'Cons 2015'.
ndp_2015 = Zcan.loc[Zcan.label == 'NDP 2015', 'dim1'].values[0]
cons_2015 = Zcan.loc[Zcan.label == 'Cons 2015', 'dim1'].values[0]
if ndp_2015 > cons_2015:
    Zcan['dim1'] = -Zcan['dim1']

# Figure 3a: British party projection.
figure3a_path = 'figures/figure3a.pdf'
plots.plot_3a(Zuk, uklabs, ukcols, ukmkers, savepath=figure3a_path)
print("Saved Figure 3a to file " + figure3a_path)

# Figure 3b: Canadian party projection.
figure3b_path = 'figures/figure3b.pdf'
plots.plot_3b(Zcan, canlabs, cancols, canmkers, savepath=figure3b_path)
print("Saved Figure 3b to file " + figure3b_path)

#=========================================================================================#
# Table 2: Accuracy of Party Placement against Gold Standards
#=========================================================================================#
# Collect results
# 1. USA
# Adding Senate model
senmodel = Doc2Vec.load('models/senate200')

# Fitting PCA dimensionality reduction model for Senate.  The Senate model is
# queried with the same party tags (and labelled with the same labels) as the
# House model, but the matrix width comes from the Senate model itself rather
# than from the House model's vector size.
zsen = np.zeros((P, senmodel.vector_size))
for i, tag in enumerate(parties):
    zsen[i, :] = senmodel.docvecs[tag]
pca_sen = PCA(n_components=2)
Zsen = pd.DataFrame(pca_sen.fit_transform(zsen), columns=['dim1', 'dim2'])
Zsen['label'] = labs

# The sign of a principal component is arbitrary, so orient the Senate axis
# with the same rule as the House axis ('Dem 2015' must not lie above
# 'Rep 2015').  Without this, the sign of the correlations and the pairwise
# accuracies below would depend on the sign the PCA solver happens to return.
if Zsen[Zsen.label == 'Dem 2015'].dim1.values[0] > Zsen[Zsen.label == 'Rep 2015'].dim1.values[0]:
    Zsen['dim1'] = Zsen.dim1 * (-1)

# Joining projections with external gold standards (left joins keep every
# gold-standard row, matched on the party-year label).
gold_house = pd.read_csv('data/goldstandard_house.csv').merge(Z, on='label', how='left')
gold_senate = pd.read_csv('data/goldstandard_senate.csv').merge(Zsen, on='label', how='left')
gold_uk = pd.read_csv('data/goldstandard_uk.csv').merge(Zuk, on='label', how='left')
gold_can = pd.read_csv('data/goldstandard_canada.csv').merge(Zcan, on='label', how='left')

gold_scores = ['voteview', 'experts_stand', 'rile', 'vanilla', 'legacy']
countries = [('US House', gold_house),
             ('US Senate', gold_senate),
             ('Canada', gold_can),
             ('Britain', gold_uk)]

# Two rows (correlation, accuracy) per gold standard, one column per chamber.
results = np.zeros((2 * len(gold_scores), len(countries)), dtype=object)

for idx, (c, df) in enumerate(countries):
    for k, g in enumerate(gold_scores):
        jdx = 2 * k
        if g not in df.columns:
            # This file does not provide the gold standard (presumably
            # Voteview for the non-US files): leave both cells blank.
            results[jdx:(jdx+2), idx] = ['', '']
        else:
            # Score only the rows where the gold standard is observed.
            temp = df[pd.notnull(df[g])]
            corr = '%0.3f' % temp.dim1.corr(temp[g])
            acc = '%0.2f%%' % pairwise_accuracy(temp[g].tolist(), temp.dim1.tolist())
            results[jdx:(jdx+2), idx] = [corr, acc]

results = pd.DataFrame(results, columns=[c for c, df in countries])
results.insert(loc=0, column='Metric', value=['Correlation', 'Accuracy'] * len(gold_scores))
# Display names follow the order of gold_scores, each repeated for its two rows.
results.insert(loc=0, column='Gold Standard', value=[item for item in ['Voteview', 'Experts Surveys', 'rile', 'vanilla', 'legacy'] for i in range(2)])

with open('tables/table2.txt', 'w') as f:
    print("Table 2: Accuracy of Party Placement against Gold Standards\n"+"-"*83, file=f)
    print(tabulate(results, headers="keys", showindex=False, tablefmt="orgtbl"), file=f)
print("Saved Table 2 to file tables/table2.txt")

#=========================================================================================#
# Figure 4: Comparison with WordFish Estimates
#=========================================================================================#
# Fitted WordFish estimates shipped with the replication data; both frames
# are reused below for Table 3.
wf_full = pd.read_csv("data/wf_full.csv")
wf_short = pd.read_csv("data/wf_short.csv")

# Figure 4a (wf_full estimates)
figure4a_path = 'figures/figure4a.pdf'
plots.plot_4a(wf_full, savepath=figure4a_path)
print("Saved Figure 4a to file " + figure4a_path)

# Figure 4b (wf_short estimates)
figure4b_path = 'figures/figure4b.pdf'
plots.plot_4b(wf_short, savepath=figure4b_path)
print("Saved Figure 4b to file " + figure4b_path)

#=========================================================================================#
# Table 3: Accuracy of Party Placement in the US House: WordFish and Party Embeddings
#=========================================================================================#
def _stack_wordfish(estimates, column):
    """Stack the democrat then republican estimates into one labelled column.

    Labels are 'Dem <year>' / 'Rep <year>' so the result can be merged with
    the gold-standard frame on its 'label' column.
    """
    stacked = pd.DataFrame(estimates.democrat.tolist() + estimates.republican.tolist(),
                           columns=[column])
    stacked['label'] = (['Dem ' + str(y) for y in estimates.year]
                        + ['Rep ' + str(y) for y in estimates.year])
    return stacked


def _voteview_fit(frame, placement):
    """Return [correlation, pairwise accuracy] of a placement against voteview."""
    return ['%0.3f' % frame[placement].corr(frame.voteview),
            '%0.2f%%' % pairwise_accuracy(frame.voteview.tolist(), frame[placement].tolist())]


# Attach both WordFish series to the House gold standards.
wff_reshaped = _stack_wordfish(wf_full, 'wf_long')
wfs_reshaped = _stack_wordfish(wf_short, 'wf_short')
gold_house = gold_house.merge(wff_reshaped, on='label', how='left')
gold_house = gold_house.merge(wfs_reshaped, on='label', how='left')

# Row 0 holds the method names; rows 1-2 hold correlation and accuracy.
results = np.zeros((3, 4), dtype=object)
results[0, :] = ['WordFish', 'Embeddings'] * 2

# Columns 0-1 use every House row; columns 2-3 only the rows for which the
# short WordFish series has an estimate.
temp = gold_house[pd.notnull(gold_house.wf_short)]
comparisons = [(gold_house, 'wf_long'), (gold_house, 'dim1'),
               (temp, 'wf_short'), (temp, 'dim1')]
for col, (frame, placement) in enumerate(comparisons):
    results[1:3, col] = _voteview_fit(frame, placement)

results = pd.DataFrame(results, columns=['1921-2016', '1921-2016', '2007-2016', '2007-2016'])
results.insert(loc=0, column='Metric', value=['', 'Correlation', 'Pairwise Accuracy'])

with open('tables/table3.txt', 'w') as f:
    print("Table 3: Accuracy of Party Placement in the US House: WordFish and Party Embeddings", file=f)
    print("-"*76, file=f)
    print(tabulate(results, headers="keys", showindex=False, tablefmt="orgtbl"), file=f)
print("Saved Table 3 to file tables/table3.txt")

#=========================================================================================#
# Figure 5: Ideological Placement of Senators (114th Congress)
#=========================================================================================#
# Loading model with legislator level embeddings
senator_model = Doc2Vec.load('models/cp_senate200')

# Select the document tags containing 'D_114' and 'R_114' (presumably the
# Democratic and Republican senators of the 114th Congress); the first group
# fills the leading rows of the matrix, the second group the rest.
dems = [d for d in senator_model.docvecs.offset2doctag if 'D_114' in d]
reps = [d for d in senator_model.docvecs.offset2doctag if 'R_114' in d]
sm_parties = dems + reps
nsm = len(sm_parties)

# One row per selected tag.  The width is read from the model, as in the
# other sections, instead of being hard-coded to 200.
zsm = np.zeros((nsm, senator_model.vector_size))
for i, tag in enumerate(sm_parties):
    zsm[i, :] = senator_model.docvecs[tag]
pca_sm = PCA(n_components=2)
Zsm = pd.DataFrame(pca_sm.fit_transform(zsm), columns=['dim1', 'dim2'])

# Figure 5
plots.plot_5(Zsm, dems, reps, savepath='figures/figure5.pdf')
print('Saved Figure 5 to file figures/figure5.pdf')

#=========================================================================================#
# Table 4: Accuracy of Senator Ideological Placement
#=========================================================================================#
# Senator-level gold standards for Congress 114.  The file already carries
# the embedding placement in its senator_embedding_pca1 column, which is what
# every gold standard below is compared against.
gold_senators = pd.read_csv('data/goldstandard_senate114.csv')
gold_scores = ['nominate_dim1', 'nokken_poole_dim1', 'ACU_2016', 'ACU_2015', 'ACU_Life', 'govtrack']

# One row per gold standard: [correlation, pairwise accuracy].
results = np.zeros((6, 2), dtype=object)
for row, score in enumerate(gold_scores):
    placement = gold_senators.senator_embedding_pca1
    benchmark = gold_senators[score]
    results[row, 0] = '%0.3f' % placement.corr(benchmark)
    results[row, 1] = '%0.2f%%' % pairwise_accuracy(benchmark.tolist(), placement.tolist())

results = pd.DataFrame(results, columns=['Correlation', 'Pairwise Accuracy'])
# Display names, in the same order as gold_scores.
display_names = ['DW-NOMINATE', 'Nokken-Poole', 'ACU 2016', 'ACU 2015', 'ACU Life', 'GovTrack']
results.insert(loc=0, column='Gold Standard', value=display_names)

with open('tables/table4.txt', 'w') as f:
    print("Table 4: Accuracy of Senator Ideological Placement\n" + "-"*57, file=f)
    print(tabulate(results, headers="keys", showindex=False, tablefmt="orgtbl"), file=f)
print("Saved Table 4 to file tables/table4.txt")
